Still under construction.
(III) Detailed List
- Read and load the page source, line by line, for each of the 250 movies.
- Add Content Rating, Number of Rater, Genre, Budget, Opening Weekend USA, Gross USA and Cumulative Worldwide Gross by reading each movie’s link.
- The data was collected on 2020-10-29.
# get source code of a single movie
#
# Args:
#   curr_movie_link: location of one movie page, handed straight to
#     readLines().
# Returns:
#   Character vector holding the page source, one element per line.
h_get_movie_source_code <- function(curr_movie_link) {
  readLines(curr_movie_link, encoding = "UTF-8")
}
# get basic info json from the single movie source code
#
# Args:
#   movie_source_code: character vector with the page source, one element
#     per line.
# Returns:
#   The lines from the first one that opens the `application/ld+json` script
#   block through the first line at or after it that contains `}</script>`.
#   Returns character(0) when the page has no such block.
h_get_json_from_movie_source_code <- function(movie_source_code) {
  json_start_pattern <- "<script type=\"application/ld\\+json\">\\{"
  json_end_pattern <- "\\}</script>"

  start_lines <- grep(json_start_pattern, movie_source_code)
  if (length(start_lines) == 0L) {
    return(character(0))
  }
  json_start_line <- start_lines[1L]

  # Only a closing line at or after the opening line can end the JSON block;
  # other scripts earlier in the page also end with `}</script>`.
  end_lines <- grep(json_end_pattern, movie_source_code)
  end_lines <- end_lines[end_lines >= json_start_line]
  if (length(end_lines) == 0L) {
    return(character(0))
  }

  movie_source_code[json_start_line:end_lines[1L]]
}
# get box office info from the single movie source code
#
# Args:
#   movie_source_code: character vector with the page source, one element
#     per line.
# Returns:
#   The lines from the first "Box Office" subheading through the first
#   `<hr />` line that comes after it. Returns character(0) when the page
#   has no Box Office subheading or no `<hr />` after it.
h_get_box_office_from_movie_source_code <- function(movie_source_code) {
  box_office_start_pattern <- "<h3 class=\"subheading\">Box Office</h3>"
  box_office_end_pattern <- "<hr />"

  # Search the argument itself, not a variable from the calling environment.
  start_lines <- grep(box_office_start_pattern, movie_source_code)
  if (length(start_lines) == 0L) {
    return(character(0))
  }
  box_office_start_line <- start_lines[1L]

  # `<hr />` separates many sections; keep only those below the subheading.
  end_lines <- grep(box_office_end_pattern, movie_source_code)
  end_lines <- end_lines[end_lines > box_office_start_line]
  if (length(end_lines) == 0L) {
    return(character(0))
  }

  movie_source_code[box_office_start_line:end_lines[1L]]
}
# Fetch the page of the first movie link, then slice out its JSON block and
# its Box Office block.
curr_source_code <- h_get_movie_source_code(movie_link[1])
curr_json <- h_get_json_from_movie_source_code(curr_source_code)
curr_box_office <- h_get_box_office_from_movie_source_code(curr_source_code)
# Print the extracted JSON lines.
cat(curr_json)
<script type="application/ld+json">{ "@context": "http://schema.org", "@type": "Movie", "url": "/title/tt0111161/", "name": "The Shawshank Redemption", "image": "https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_.jpg", "genre": "Drama", "contentRating": "R", "actor": [ { "@type": "Person", "url": "/name/nm0000209/", "name": "Tim Robbins" }, { "@type": "Person", "url": "/name/nm0000151/", "name": "Morgan Freeman" }, { "@type": "Person", "url": "/name/nm0348409/", "name": "Bob Gunton" }, { "@type": "Person", "url": "/name/nm0006669/", "name": "William Sadler" } ], "director": { "@type": "Person", "url": "/name/nm0001104/", "name": "Frank Darabont" }, "creator": [ { "@type": "Person", "url": "/name/nm0000175/", "name": "Stephen King" }, { "@type": "Person", "url": "/name/nm0001104/", "name": "Frank Darabont" }, { "@type": "Organization", "url": "/company/co0040620/" } ], "description": "The Shawshank Redemption is a movie starring Tim Robbins, Morgan Freeman, and Bob Gunton. Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", "datePublished": "1994-09-23", "keywords": "wrongful imprisonment,based on the works of stephen king,prison,escape from prison,voice over narration", "aggregateRating": { "@type": "AggregateRating", "ratingCount": 2299160, "bestRating": "10.0", "worstRating": "1.0", "ratingValue": "9.3" }, "review": { "@type": "Review", "itemReviewed": { "@type": "CreativeWork", "url": "/title/tt0111161/" }, "author": { "@type": "Person", "name": "nowego" }, "dateCreated": "2018-06-15", "inLanguage": "English", "name": "Eternal Hope", "reviewBody": "I\u0027ve lost count of the number of times I have seen this movie, but it is more than 20. It has to be one of the best movies ever made. 
It made me take notice Morgan Freeman and Tim Robbins like I had never noticed any actors before.\n\nI have from a very young age been a huge fan of anything Stephen King writes and had already read the short story that this movie is based on years prior to seeing this movie.\n\nNot everything Stephen King has written that gets turned into a movie comes out well, but this is as close to perfection as it gets and has everything you could ever want in a movie.\n\nSomething that is outstanding is the fact that it has no real action, no special effects and no gimmicks. 99% of the movie is just men in a prison uniforms talking. Yet it absolutely hooks you almost from the beginning and has you glued to the screen to the end.\n\nFor me what really makes this film one of the best is the message of eternal hope it conveys throughout. The never ever give up hope attitude of the main character so well conveyed by Tim Robbins. The ending is just spine tingling every time I see it, no matter how many times I have seen it.\n\nBrilliant, brilliant movie and a must see for everyone.", "reviewRating": { "@type": "Rating", "worstRating": "1", "bestRating": "10", "ratingValue": "10" } }, "duration": "PT2H22M", "trailer": { "@type": "VideoObject", "name": "Official Trailer", "embedUrl": "/video/imdb/vi3877612057", "thumbnail": { "@type": "ImageObject", "contentUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg" }, "thumbnailUrl": "https://m.media-amazon.com/images/M/MV5BNjQ2NDA3MDcxMF5BMl5BanBnXkFtZTgwMjE5NTU0NzE@._V1_.jpg", "description": "Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.", "uploadDate": "2014-03-05T14:13:19Z" } }</script>
# Print the extracted Box Office lines.
cat(curr_box_office)
<h3 class="subheading">Box Office</h3> <div class="txt-block"> <h4 class="inline">Budget:</h4>$25,000,000 <span class="attribute">(estimated)</span> </div> <div class="txt-block"> <h4 class="inline">Opening Weekend USA:</h4> $727,327, <span class="attribute">25 September 1994</span> </div> <div class="txt-block"> <h4 class="inline">Gross USA:</h4> $28,699,976 </div> <div class="txt-block"> <h4 class="inline">Cumulative Worldwide Gross:</h4> $28,815,291 </div> <span class="see-more inline"> <a href="https://pro.imdb.com/title/tt0111161?rf=cons_tt_bo_tt&ref_=cons_tt_bo_tt" >See more on IMDbPro</a> » </span> <hr />
| Field | Pattern searched in the page source |
| --- | --- |
| Title | `h1 itemprop="name"` |
| Year | Next line of Title |
| Content Rating | `meta itemprop="contentRating"` |
| User Rating | `span itemprop="ratingValue"` |
| Number of Rater | `itemprop="ratingCount"` |
| Genre | `span class="itemprop" itemprop="genre"` |
| Budget | `<h4 class="inline">Budget` |
| Opening Weekend USA ($) | `<h4 class="inline">Opening Weekend USA` |
| Gross USA ($) | `<h4 class="inline">Gross` |
| Cumulative Worldwide Gross ($) | `<h4 class="inline">Cumulative` |
#Design function to get target information from a single page
#Each input is a website link from `movie_link`
#
# Reads the page line by line and pulls ten fields out of the raw HTML. Each
# field is found by searching for a marker pattern and then splitting the
# matching line on ">" / "<" / " ".
#
# Args:
#   input: passed to readLines() as `con` (a page link or a file path).
#
# Returns:
#   Character vector of length 10, in this order: title, year, content
#   rating, user rating, number of raters, genres joined by ", ", budget,
#   opening weekend USA, gross USA, cumulative worldwide gross.
#   A field whose marker is not found in the page is "-". When a marker
#   matches several lines, the first match is used (genre uses all matches).
get.target.info <- function(input) {
  page <- readLines(con = input, encoding = "UTF-8")
  missing_value <- "-"

  # n-th piece of one string after splitting on `split`; NA when absent.
  nth_piece <- function(text, split, n) {
    strsplit(text, split = split)[[1]][n]
  }

  # n-th ">"-separated piece of a line, cut at the next "<".
  tag_text <- function(line, n) {
    nth_piece(nth_piece(line, ">", n), "<", 1L)
  }

  # Apply `parse` to the first line matching `pattern` (shifted down by
  # `offset` lines), or give the missing-value marker when nothing matches.
  extract_field <- function(pattern, parse, offset = 0L) {
    hits <- grep(pattern, page)
    if (length(hits) == 0L) {
      return(missing_value)
    }
    parse(page[hits[1L] + offset])
  }

  # Budget: text after the closing </h4>, with the currency normalised to
  # "<CODE> <amount>" for the known non-dollar currencies.
  parse_budget <- function(line) {
    budget <- nth_piece(line, ">", 3L)
    if (is.na(budget)) {
      return(budget)
    }
    # Three-letter codes: keep only the code and the amount that follows it.
    currency_codes <- c("FRF", "JPY", "INR", "DEM", "RUR", "TRL", "AUD", "KRW")
    if (substr(budget, 1L, 3L) %in% currency_codes) {
      tokens <- strsplit(budget, split = " ")[[1]]
      return(paste(tokens[1L], tokens[2L], sep = " "))
    }
    # Currency symbols arrive as HTML entities in the raw source; the literal
    # characters are accepted as well.
    symbol_prefixes <- c("&euro;", "\u20ac", "&pound;", "\u00a3")
    symbol_codes <- c("EUR", "EUR", "GBP", "GBP")
    for (k in seq_along(symbol_prefixes)) {
      if (startsWith(budget, symbol_prefixes[k])) {
        amount <- substr(budget, nchar(symbol_prefixes[k]) + 1L, nchar(budget))
        return(paste(symbol_codes[k], amount))
      }
    }
    budget
  }

  # Opening / gross / worldwide gross: second space-separated token after the
  # closing </h4>, without a trailing comma.
  parse_amount <- function(line) {
    amount <- nth_piece(nth_piece(line, ">", 3L), " ", 2L)
    sub(",$", "", amount)
  }

  #1. title----
  # NOTE(review): cutting at "&" drops the trailing entity after the title,
  # but it also truncates any title that itself contains "&".
  title_pattern <- "h1 itemprop=\"name\""
  movie_title <- extract_field(
    title_pattern,
    function(line) nth_piece(nth_piece(line, ">", 2L), "&", 1L)
  )

  #2. year----
  # The year is read from the line directly below the title line.
  movie_year <- extract_field(
    title_pattern,
    function(line) tag_text(line, 2L),
    offset = 1L
  )

  #3. content rating----
  movie_content_rating <- extract_field(
    "meta itemprop=\"contentRating\"",
    function(line) nth_piece(line, ">", 2L)
  )

  #4. user rating----
  movie_user_rating <- extract_field(
    "span itemprop=\"ratingValue\"",
    function(line) tag_text(line, 3L)
  )

  #5. number of rater----
  movie_num_rater <- extract_field(
    "itemprop=\"ratingCount\"",
    function(line) tag_text(line, 3L)
  )

  #6. genre----
  # Every matching line contributes one genre.
  genre_lines <- page[grep("span class=\"itemprop\" itemprop=\"genre\"", page)]
  if (length(genre_lines) == 0L) {
    movie_genre <- missing_value
  } else {
    genres <- vapply(genre_lines, tag_text, character(1), n = 3L,
                     USE.NAMES = FALSE)
    movie_genre <- paste(genres, collapse = ", ")
  }

  #7. budget----
  movie_budget <- extract_field("<h4 class=\"inline\">Budget", parse_budget)

  #8. opening----
  movie_opening <- extract_field(
    "<h4 class=\"inline\">Opening Weekend USA",
    parse_amount
  )

  #9. gross----
  movie_gross <- extract_field("<h4 class=\"inline\">Gross", parse_amount)

  #10. worldwide gross----
  movie_worldwide_gross <- extract_field(
    "<h4 class=\"inline\">Cumulative",
    parse_amount
  )

  #11. result----
  c(
    movie_title, movie_year, movie_content_rating, movie_user_rating,
    movie_num_rater, movie_genre, movie_budget, movie_opening, movie_gross,
    movie_worldwide_gross
  )
}
#Collecting data----
# Scrape each of the first 250 links in `movie_link` exactly once. The
# results are gathered in a single 10 x 250 character matrix (one column per
# movie) instead of appending to ten vectors on every iteration, which
# re-copies each vector every time it grows.
movie_count <- 250
movie_info <- vapply(
  seq_len(movie_count),
  # `[1:10]` keeps exactly the ten positional fields of one page result.
  function(i) get.target.info(movie_link[i])[1:10],
  character(10)
)
# One character vector per field, in the order returned by get.target.info().
movie_title <- movie_info[1, ]
movie_year <- movie_info[2, ]
movie_content_rating <- movie_info[3, ]
movie_user_rating <- movie_info[4, ]
movie_num_rater <- movie_info[5, ]
movie_genre <- movie_info[6, ]
movie_budget <- movie_info[7, ]
movie_opening <- movie_info[8, ]
movie_gross <- movie_info[9, ]
movie_worldwide_gross <- movie_info[10, ]
#Visualization----
library(knitr)
# Assemble the rank and the ten scraped fields into one table in which every
# column is stored as character, then render it with centred columns.
y <- data.frame(
  movie_rank = as.character(movie_rank),
  movie_title = as.character(movie_title),
  movie_year = as.character(movie_year),
  movie_content_rating = as.character(movie_content_rating),
  movie_user_rating = as.character(movie_user_rating),
  movie_num_rater = as.character(movie_num_rater),
  movie_genre = as.character(movie_genre),
  movie_budget = as.character(movie_budget),
  movie_opening = as.character(movie_opening),
  movie_gross = as.character(movie_gross),
  movie_worldwide_gross = as.character(movie_worldwide_gross),
  stringsAsFactors = FALSE
)
kable(
  y,
  align = "c",
  col.names = c(
    "Rank", "Title", "Year", "Content Rating", "User Rating",
    "Number of Rater", "Genre", "Budget", "Opening Weekend USA", "Gross USA",
    "Cumulative Worldwide Gross"
  )
)